library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(Synth)
## Warning: package 'Synth' was built under R version 4.3.3
## ##
## ## Synth Package: Implements Synthetic Control Methods.
## ## See https://web.stanford.edu/~jhain/synthpage.html for additional information.
library(geosphere)
## Warning: package 'geosphere' was built under R version 4.3.3
library(estimatr)
## Warning: package 'estimatr' was built under R version 4.3.3
library(fixest)
## Warning: package 'fixest' was built under R version 4.3.3
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.3.2
df_combined <- read_csv("df_combined_complete.csv")
## Rows: 121693 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): town_code, month, dia, station_name, address, station_type, nombr...
## dbl  (22): provincia, station_code.x, punto_muestreo, year, air_quality, cit...
## date  (1): fecha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_combined <- df_combined |>  drop_na(air_quality)

df_combined<- df_combined |> 
  filter(as.Date(fecha) <= as.Date("2021-06-30"))

#air quality monitoring stations in Madrid are divided into four zones under current regulations
df_combined <- df_combined |> 
  mutate(zone = case_when(
    station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla", 
                        "Ramón y Cajal", "Cuatro Caminos", "Plaza de España", 
                        "Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro", 
                        "Parque del Retiro") ~ "Zone 1",
    station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zone 2",
    station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada", 
                        "Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
    station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
    station_name %in% c("Valdemoro", "Arganda del Rey", "Collado Villalba", "Algete", "Colmenar Viejo", "Alcala de Henares") ~ "Control",
    TRUE ~ "Excluded"
  ))

I again confirm that the assumption of parallel trends does not hold, using monthly data.

#MONTHLY
monthly_avg <- df_combined %>%
  group_by(year, month, zone) %>%
  summarise(mean_air_quality = mean(air_quality, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
monthly_avg <- monthly_avg %>%
  mutate(date = as.Date(paste(year, month, "01", sep = "-"), format = "%Y-%m-%d"))

ggplot(monthly_avg, aes(x = date, y = mean_air_quality, color = zone)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2018-11-01"), linetype = "dashed", color = "red", size = 1) +
  labs(title = "Serie Temporal de la Contaminación Media Mensual por Año y Tratamiento",
       x = "Fecha",
       y = "Contaminación Media (air_quality)",
       color = "zone") +
  theme_minimal() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Map to visualize monitoring stations

#MAP

color_pal <- colorFactor(
  palette = c("red", "grey", "lightblue", "orange", "purple", "yellow"),
  domain = c("Control", "Excluded", "Zone 1", "Zone 2", "Zone 3", "Zone 4")
)


map <- leaflet(df_combined) |> 
  addProviderTiles(providers$CartoDB.Positron) |> 
  setView(lng = median(df_combined$longitude), lat = median(df_combined$latitude), zoom = 12) |> 
  # Añadir círculos con color basado en la variable 'zone'
  addCircles(
    lng = ~longitude, 
    lat = ~latitude,
    color = ~color_pal(zone),
    fillOpacity = 0.7,
    radius = 5
  ) |> 
  addLabelOnlyMarkers(
    lng = median(df_combined$longitude) - 0.3, 
    lat = median(df_combined$latitude) + 0.1,
    labelOptions = labelOptions(textsize = "20px", noHide = TRUE, textOnly = TRUE)
  ) |> 
  addLegend(
    position = "topright",
    pal = color_pal,
    values = c("Zone 1", "Zone 2", "Zone 3", "Zone 4", "Excluded", "Control"),
    title = "Zone",
    opacity = 1
  )

# Mostrar el mapa
map
  1. DIFFERENCES IN DIFFERENCES APPROACH

1.1 MADRID MUNICIPALITY

df_combined <- df_combined |> 
  filter(zone != "Excluded")

monthly_avg <- df_combined |> 
  group_by(year, month, zone) |> 
  summarise(mean_air_quality = mean(air_quality, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
df_combined <- df_combined |> 
  filter(!(fecha >= as.Date("2020-02-01") & fecha <= as.Date("2020-07-31"))) #omitted covid period

df_combined <- df_combined |>  filter(as.Date(fecha) <= as.Date("2021-06-30"))

model0 <- feols(air_quality ~ city_center + post_treat_mad_central + city_center*post_treat_mad_central + prec + dir + velmedia + racha +year + zone,  data = df_combined, cluster = ~zone)
## The variable 'zoneZone 4' has been removed because of collinearity (see $collin.var).
summary(model0)
## OLS estimation, Dep. Var.: air_quality
## Observations: 43,798
## Standard-errors: Clustered (zone) 
##                                       Estimate Std. Error    t value   Pr(>|t|)
## (Intercept)                        7488.794175 344.801972  21.719116 2.6587e-05
## city_center                          -8.517192   0.560793 -15.187766 1.0958e-04
## post_treat_mad_central                5.384099   0.384141  14.015927 1.5034e-04
## prec                                 -0.071051   0.030686  -2.315440 8.1544e-02
## dir                                   0.001942   0.007973   0.243523 8.1958e-01
## velmedia                             -2.595358   0.161292 -16.091069 8.7239e-05
## racha                                -1.214888   0.202946  -5.986253 3.9153e-03
## year                                 -3.690046   0.170816 -21.602521 2.7162e-05
## zoneZone 1                           22.343871   0.237664  94.014371 7.6744e-08
## zoneZone 2                           23.281388   0.253276  91.920995 8.3975e-08
## zoneZone 3                           18.740243   0.510002  36.745455 3.2749e-06
## city_center:post_treat_mad_central   -1.344124   0.706985  -1.901206 1.3006e-01
##                                       
## (Intercept)                        ***
## city_center                        ***
## post_treat_mad_central             ***
## prec                               .  
## dir                                   
## velmedia                           ***
## racha                              ** 
## year                               ***
## zoneZone 1                         ***
## zoneZone 2                         ***
## zoneZone 3                         ***
## city_center:post_treat_mad_central    
## ... 1 variable was removed because of collinearity (zoneZone 4)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 16.2   Adj. R2: 0.327826

1.2 Assesing heterogeneity treatment effects. Differences in differences approach by zone.

df_combined<- df_combined |> 
  mutate(year = as.factor(year), zone = as.factor(zone), post = as.factor(post_treat_mad_central))


df_combined <- df_combined |> 
  mutate(
    post = as.numeric(as.character(post)),
    year = as.numeric(as.character(year))
  )


df_combined <- df_combined |> 
  mutate(
    treatment_zone1 = ifelse(zone == "Zone 1", 1, 0),
    treatment_zone2 = ifelse(zone == "Zone 2", 1, 0),
    treatment_zone3 = ifelse(zone == "Zone 3", 1, 0),
    treatment_zone4 = ifelse(zone == "Zone 4", 1, 0))
    
  
df_combined <- df_combined %>%
  mutate(
    POST_treat_1 = post * treatment_zone1,
    POST_treat_2 = post * treatment_zone2,
    POST_treat_3 = post * treatment_zone3,
    POST_treat_4 = post * treatment_zone4,
  )


model1 <- feols(air_quality ~ post + treatment_zone1 + treatment_zone2 + treatment_zone3 + treatment_zone4 + POST_treat_1  + POST_treat_2 + POST_treat_3 + POST_treat_4 + prec + dir + velmedia + racha + year,  data = df_combined, cluster = ~zone)
summary(model1)
## OLS estimation, Dep. Var.: air_quality
## Observations: 43,798
## Standard-errors: Clustered (zone) 
##                    Estimate Std. Error    t value   Pr(>|t|)    
## (Intercept)     7494.951212 339.824579  22.055354 2.5013e-05 ***
## post               5.393797   0.377512  14.287741 1.3939e-04 ***
## treatment_zone1   14.469147   0.250641  57.728554 5.3916e-07 ***
## treatment_zone2   14.367790   0.232390  61.826304 4.0992e-07 ***
## treatment_zone3   10.263769   0.150640  68.134516 2.7801e-07 ***
## treatment_zone4  -10.538701   0.439363 -23.986300 1.7918e-05 ***
## POST_treat_1      -2.561131   0.154686 -16.556935 7.7937e-05 ***
## POST_treat_2      -0.571681   0.147279  -3.881627 1.7817e-02 *  
## POST_treat_3      -1.417991   0.120702 -11.747869 3.0035e-04 ***
## POST_treat_4       2.548084   0.078853  32.314438 5.4676e-06 ***
## prec              -0.070765   0.030658  -2.308220 8.2193e-02 .  
## dir                0.002055   0.007973   0.257692 8.0936e-01    
## velmedia          -2.583066   0.165140 -15.641672 9.7561e-05 ***
## racha             -1.217143   0.203216  -5.989392 3.9078e-03 ** 
## year              -3.693107   0.168336 -21.938964 2.5544e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 16.2   Adj. R2: 0.328812
  1. Counterfactual Estimation Using Predictive Models
data_post_policy <- read_csv("predictions_xgboost_POSTPOLICY_CITYCENTER_CV") |> 
  filter(fecha < as.Date("2021-05-01")) |> 
  filter(!(fecha >= as.Date("2020-02-01") & fecha <= as.Date("2020-07-31")))
## Rows: 44321 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): dia, station_name, station_type
## dbl  (12): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date  (1): fecha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Ajustar el nombre de las zonas
data_post_policy <- data_post_policy |> 
  mutate(zone = case_when(
    station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla", 
                        "Ramón y Cajal", "Cuatro Caminos", "Plaza de España", 
                        "Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro", 
                        "Parque del Retiro") ~ "Zone 1",
    station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
    station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada", 
                        "Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
    station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
    TRUE ~ NA_character_
  ))

DESCRIPTIVE STATISTICS FOR MADRID

mean_no2 <- data_post_policy |> 
  summarise(
    mean_no2= mean(air_quality, na.rm = TRUE),
    sd_no2 = sd(air_quality, na.rm = TRUE),
    mean_tmed=mean(tmed),
    sd_tmed=sd(tmed),
    mean_prec=mean(prec),
    sd_prec=sd(prec),
    mean_prec=mean(prec),
    sd_prec=sd(prec),
    mean_dir=mean(dir),
    sd_dir=sd(dir)
    
  )
print(mean_no2)
## # A tibble: 1 × 8
##   mean_no2 sd_no2 mean_tmed sd_tmed mean_prec sd_prec mean_dir sd_dir
##      <dbl>  <dbl>     <dbl>   <dbl>     <dbl>   <dbl>    <dbl>  <dbl>
## 1     34.3   20.1      13.6    7.30      1.18    3.99     26.8   25.2

DESCRIPTIVE STATISTICS BY ZONE

mean_no2_zone<- data_post_policy |> 
  group_by(zone) |> 
  summarise(
    mean_no2= mean(air_quality, na.rm = TRUE),
    sd_no2 = sd(air_quality, na.rm = TRUE),
    mean_tmed=mean(tmed),
    sd_tmed=sd(tmed),
    mean_prec=mean(prec),
    sd_prec=sd(prec),
    mean_prec=mean(prec),
    sd_prec=sd(prec),
    mean_racha=mean(racha),
    sd_racha=sd(racha)
    
  )
print(mean_no2_zone)
## # A tibble: 4 × 9
##   zone   mean_no2 sd_no2 mean_tmed sd_tmed mean_prec sd_prec mean_racha sd_racha
##   <chr>     <dbl>  <dbl>     <dbl>   <dbl>     <dbl>   <dbl>      <dbl>    <dbl>
## 1 Zona 2     38.4   20.9      14.2    7.47      1.2     4.06       9.56     3.96
## 2 Zone 1     36.5   20.1      13.5    7.19      1.20    4.11       9.56     4.02
## 3 Zone 3     31.9   18.6      13.4    7.37      1.12    3.79       9.97     4.33
## 4 Zone 4     18.7   13.1      12.6    7.05      1.11    3.74       8.53     4.01

2.1 AVERAGE TREATMENT ON TREATED (ATT) ESTIMATION

# Calculate the difference between observed and predicted air quality
data_with_difference <- data_post_policy |> 
  mutate(
    difference_air_quality_predicted = air_quality - predicted_air_quality
  )

# Calculate the overall mean of the difference <- ATT (AVERAGE TREATMENT ON TREATET)
mean_difference_general <- data_with_difference |> 
  summarise(
    mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE)
  )

print(mean_difference_general)
## # A tibble: 1 × 1
##   mean_difference
##             <dbl>
## 1           -3.25

Calculating standar errors as Souza(2019) proposed

# Ajustar la diferencia restando la media general
data_with_adjusted_difference <- data_with_difference |> 
  mutate(
    adjusted_difference = difference_air_quality_predicted - mean_difference_general$mean_difference
  )

# Calcular el sumatorio de las diferencias ajustadas
sum_adjusted_difference_general <- data_with_adjusted_difference |> 
  summarise(
    sum_adjusted_difference = sum(adjusted_difference, na.rm = TRUE)
  )

# Leer datos de entrenamiento
train_data_20 <- read_csv("train_data_20.csv") |> 
  mutate(zone = case_when(
    station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla", 
                        "Ramón y Cajal", "Cuatro Caminos", "Plaza de España", 
                        "Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro", 
                        "Parque del Retiro") ~ "Zone 1",
    station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
    station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada", 
                        "Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
    station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
    TRUE ~ NA_character_
  ))
## Rows: 5028 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (2): station_name, station_type
## dbl  (13): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date  (1): fecha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Calcular el sumatorio de residuos de validación cruzada
sumatorio_cv_residuos_general <- train_data_20 |> 
  summarise(
    sumatorio_cv_residuos = sum(cv_residuos, na.rm = TRUE)
  )

# Calcular la varianza y el error estándar
standar_error_cv_general <- sum_adjusted_difference_general |> 
  mutate(
    variance_cv = (sum_adjusted_difference + sumatorio_cv_residuos_general$sumatorio_cv_residuos)^2,
    se_cv = sqrt(variance_cv)
  )

# Mostrar los resultados
print(standar_error_cv_general)
## # A tibble: 1 × 3
##   sum_adjusted_difference variance_cv se_cv
##                     <dbl>       <dbl> <dbl>
## 1                1.07e-11        2.60  1.61
print(mean_difference_general)
## # A tibble: 1 × 1
##   mean_difference
##             <dbl>
## 1           -3.25

2.2 CONDITIONAL AVERAGE TREATMENT ON TREATED (C ATT) ESTIMATION BY ZONE

data_post_policy<- read_csv("predictions_xgboost_POSTPOLICY_CITYCENTER_CV")
## Rows: 44321 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): dia, station_name, station_type
## dbl  (12): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date  (1): fecha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_post_policy<- data_post_policy |> 
  filter(fecha < as.Date("2021-05-01"))

data_post_policy <- data_post_policy |> 
  filter(!(fecha >= as.Date("2020-02-01") & fecha <= as.Date("2020-07-31")))

data_post_policy <- data_post_policy |> 
  mutate(zone = case_when(
    station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla", 
                        "Ramón y Cajal", "Cuatro Caminos", "Plaza de España", 
                        "Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro", 
                        "Parque del Retiro") ~ "Zone 1",
    station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
    station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada", 
                        "Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
    station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
    TRUE ~ NA_character_
  ))
data_with_difference <- data_post_policy |> 
  mutate(
    difference_air_quality_predicted = air_quality - predicted_air_quality
  )

mean_difference_by_zone <- data_with_difference |> 
  group_by(zone) |> 
  summarise(
    mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE)
  )
print(mean_difference_by_zone) #C ATT BY ZONE
## # A tibble: 4 × 2
##   zone   mean_difference
##   <chr>            <dbl>
## 1 Zona 2          -2.28 
## 2 Zone 1          -4.39 
## 3 Zone 3          -3.35 
## 4 Zone 4          -0.130

Calculating Standar Errors by zone as proposed by (Souza, 2019)

train_data_20<- read_csv("train_data_20.csv")
## Rows: 5028 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (2): station_name, station_type
## dbl  (13): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date  (1): fecha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train_data_20 <- train_data_20 %>%
  mutate(zone = case_when(
    station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla", 
                        "Ramón y Cajal", "Cuatro Caminos", "Plaza de España", 
                        "Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro", 
                        "Parque del Retiro") ~ "Zone 1",
    station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
    station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada", 
                        "Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
    station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
    TRUE ~ NA_character_
  ))

data_with_difference <- data_post_policy |> 
  mutate(
    difference_air_quality_predicted = air_quality - predicted_air_quality
  )

mean_difference_by_zone <- data_with_difference |> 
  group_by(zone) |> 
  summarise(
    mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE)
  )

# Step 3: Subtract the overall mean from each value in the difference column for each zone category
data_with_adjusted_difference <- data_with_difference |> 
  left_join(mean_difference_by_zone, by = "zone") |> 
  mutate(
    adjusted_difference = difference_air_quality_predicted - mean_difference
  )

# Step 4: Calculate the sum of adjusted differences for each zone category
sum_adjusted_difference_by_zone <- data_with_adjusted_difference |> 
  group_by(zone) |> 
  summarise(
    sum_adjusted_difference = sum(adjusted_difference, na.rm = TRUE)
  )

# Step 5: Calculate the sum of cv_residuos
sumatorio_cv_residuos_by_zone <- train_data_20 |> 
  group_by(zone) |> 
  summarise(
    sumatorio_cv_residuos = sum(cv_residuos, na.rm = TRUE)
  )

standar_error_cv_by_zone <- sum_adjusted_difference_by_zone |> 
  left_join(sumatorio_cv_residuos_by_zone, by = "zone") |> 
  mutate(
    variance_cv = (sum_adjusted_difference + sumatorio_cv_residuos)^2
  )

# Step 2: Calculate the standard error from the variance
standar_error_cv_by_zone <- standar_error_cv_by_zone |> 
  mutate(
    se_cv = sqrt(variance_cv)
  )

# Display the results
print(standar_error_cv_by_zone)
## # A tibble: 4 × 5
##   zone   sum_adjusted_difference sumatorio_cv_residuos variance_cv se_cv
##   <chr>                    <dbl>                 <dbl>       <dbl> <dbl>
## 1 Zona 2               -1.82e-12                -0.466      0.217  0.466
## 2 Zone 1               -1.59e-12                -0.179      0.0322 0.179
## 3 Zone 3                1.94e-12                 1.85       3.41   1.85 
## 4 Zone 4               -1.48e-13                 0.411      0.169  0.411
print(mean_difference_by_zone)
## # A tibble: 4 × 2
##   zone   mean_difference
##   <chr>            <dbl>
## 1 Zona 2          -2.28 
## 2 Zone 1          -4.39 
## 3 Zone 3          -3.35 
## 4 Zone 4          -0.130
observations_per_zone <- data_post_policy %>%
  count(zone)

print(observations_per_zone)
## # A tibble: 4 × 2
##   zone       n
##   <chr>  <int>
## 1 Zona 2  4206
## 2 Zone 1  7010
## 3 Zone 3  4206
## 4 Zone 4  1402

I repeat the analysis only with the post-COVID-19 data to perform a robustness analysis. This is necessary because the pandemic may have significantly altered the dynamics of key variables, such as air quality and traffic patterns. By focusing the analysis on the post-COVID period, we verify whether the original conclusions remain valid in a different context, ensuring the consistency and reliability of the results obtained.

  1. DIFFERENCES IN DIFFERENCES APPROACH BEFORE COVID-19
df_combined <- read_csv("df_combined_complete.csv")
## Rows: 121693 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): town_code, month, dia, station_name, address, station_type, nombr...
## dbl  (22): provincia, station_code.x, punto_muestreo, year, air_quality, cit...
## date  (1): fecha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_combined <- df_combined |>  drop_na(air_quality)

df_combined <- df_combined |> 
  filter(fecha <= as.Date("2020-02-01"))


df_combined <- df_combined %>%
  mutate(zone = case_when(
    station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla", 
                        "Ramón y Cajal", "Cuatro Caminos", "Plaza de España", 
                        "Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro", 
                        "Parque del Retiro") ~ "Zone 1",
    station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zone 2",
    station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada", 
                        "Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
    station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
    station_name %in% c("Valdemoro", "Arganda del Rey", "Collado Villalba", "Algete", "Colmenar Viejo", "Alcala de Henares", "San Martin de Valdeiglesias", "Villa del Prado") ~ "Control",
    TRUE ~ "Excluded"
  ))

3.1 MADRID MUNICIPALITY BEFORE COVID-19

df_combined <- df_combined %>%
  filter(zone != "Excluded")

model0 <- feols(air_quality ~ city_center + post_treat_mad_central + city_center*post_treat_mad_central + prec + dir + velmedia + racha +year + zone,  data = df_combined, cluster = ~zone)
## The variable 'zoneZone 4' has been removed because of collinearity (see $collin.var).
summary(model0)
## OLS estimation, Dep. Var.: air_quality
## Observations: 36,063
## Standard-errors: Clustered (zone) 
##                                       Estimate Std. Error    t value   Pr(>|t|)
## (Intercept)                        8704.547787 325.824846  26.715421 1.1670e-05
## city_center                          -6.505688   0.735674  -8.843169 9.0276e-04
## post_treat_mad_central                6.461463   0.213520  30.261679 7.1027e-06
## prec                                  0.023889   0.056481   0.422964 6.9407e-01
## dir                                   0.018242   0.009109   2.002678 1.1576e-01
## velmedia                             -2.598495   0.217360 -11.954781 2.8054e-04
## racha                                -1.409307   0.370445  -3.804367 1.9032e-02
## year                                 -4.293628   0.162444 -26.431471 1.2177e-05
## zoneZone 1                           23.782929   0.363100  65.499647 3.2548e-07
## zoneZone 2                           24.647735   0.389773  63.236053 3.7460e-07
## zoneZone 3                           20.264191   0.798603  25.374548 1.4324e-05
## city_center:post_treat_mad_central   -1.564799   0.780581  -2.004660 1.1550e-01
##                                       
## (Intercept)                        ***
## city_center                        ***
## post_treat_mad_central             ***
## prec                                  
## dir                                   
## velmedia                           ***
## racha                              *  
## year                               ***
## zoneZone 1                         ***
## zoneZone 2                         ***
## zoneZone 3                         ***
## city_center:post_treat_mad_central    
## ... 1 variable was removed because of collinearity (zoneZone 4)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 16.3   Adj. R2: 0.377892

3.2 Assesing heterogeneity treatment effects. Differences in differences approach by zone before COVID-19

df_combined<- df_combined |> 
  mutate(year = as.factor(year), zone = as.factor(zone), post = as.factor(post_treat_mad_central))

df_combined <- df_combined |> 
  mutate(
    post = as.numeric(as.character(post)),
    year = as.numeric(as.character(year))
  )

# Crear variables de tratamiento para cada zona
df_combined <- df_combined |> 
  mutate(
    treatment_zone1 = ifelse(zone == "Zone 1", 1, 0),
    treatment_zone2 = ifelse(zone == "Zone 2", 1, 0),
    treatment_zone3 = ifelse(zone == "Zone 3", 1, 0),
    treatment_zone4 = ifelse(zone == "Zone 4", 1, 0))


df_combined <- df_combined %>%
  mutate(
    POST_treat_1 = post * treatment_zone1,
    POST_treat_2 = post * treatment_zone2,
    POST_treat_3 = post * treatment_zone3,
    POST_treat_4 = post * treatment_zone4,
  )
model1 <- feols(air_quality ~ post + treatment_zone1 + treatment_zone2 + treatment_zone3 + treatment_zone4 + POST_treat_1  + POST_treat_2 + POST_treat_3 + POST_treat_4 + prec + dir + velmedia + racha + year,  data = df_combined, cluster = ~zone)
summary(model1)
## OLS estimation, Dep. Var.: air_quality
## Observations: 36,063
## Standard-errors: Clustered (zone) 
##                    Estimate Std. Error    t value   Pr(>|t|)    
## (Intercept)     8706.282701 324.529304  26.827416 1.1477e-05 ***
## post               6.462936   0.212669  30.389699 6.9842e-06 ***
## treatment_zone1   17.775248   0.385205  46.144952 1.3192e-06 ***
## treatment_zone2   17.694147   0.354669  49.889121 9.6597e-07 ***
## treatment_zone3   13.687691   0.168527  81.219434 1.3774e-07 ***
## treatment_zone4   -7.416719   0.711130 -10.429488 4.7746e-04 ***
## POST_treat_1      -2.868307   0.158398 -18.108271 5.4685e-05 ***
## POST_treat_2      -0.384260   0.164323  -2.338442 7.9514e-02 .  
## POST_treat_3      -1.373467   0.336740  -4.078714 1.5114e-02 *  
## POST_treat_4       0.835244   0.186240   4.484773 1.0950e-02 *  
## prec               0.024003   0.056550   0.424460 6.9306e-01    
## dir                0.018071   0.009051   1.996602 1.1657e-01    
## velmedia          -2.596305   0.217924 -11.913809 2.8433e-04 ***
## racha             -1.409408   0.370681  -3.802210 1.9067e-02 *  
## year              -4.294488   0.161803 -26.541462 1.1977e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 16.2   Adj. R2: 0.378484
  1. Counterfactual Estimation Using Predictive Models Before COVID-19
data_post_policy<- read_csv("predictions_xgboost_POSTPOLICY_CITYCENTER_CV")
## Rows: 44321 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): dia, station_name, station_type
## dbl  (12): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date  (1): fecha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_post_policy <- data_post_policy |> 
  filter(fecha <= as.Date("2020-02-01"))


data_post_policy <- data_post_policy |> 
  mutate(zone = case_when(
    station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla", 
                        "Ramón y Cajal", "Cuatro Caminos", "Plaza de España", 
                        "Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro", 
                        "Parque del Retiro") ~ "Zone 1",
    station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
    station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada", 
                        "Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
    station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
    TRUE ~ NA_character_
  ))

# Agrupar por zona, año y mes, y calcular medias mensuales
data_post_policy_monthly <- data_post_policy |> 
  group_by(year, month, zone) |> 
  summarise(
    mean_air_quality = mean(air_quality, na.rm = TRUE),
    mean_predicted_air_quality = mean(predicted_air_quality, na.rm = TRUE),
    .groups = 'drop'
  )

# Calcular la diferencia mensual
data_with_difference_monthly <- data_post_policy_monthly |> 
  mutate(
    difference_air_quality_predicted = mean_air_quality - mean_predicted_air_quality
  )

4.1 AVERAGE TREATMENT ON TREATED BEFORE COVID-19

mean_difference_general <- data_with_difference_monthly |> 
  summarise(
    mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE)
  )
print(mean_difference_general)
## # A tibble: 1 × 1
##   mean_difference
##             <dbl>
## 1           -3.17

4.2 CONDITIONAL AVERAGE TREATMENT ON TREATED BY ZONE BEFORE COVID-19

mean_difference_by_zone <- data_with_difference_monthly |> 
  group_by(zone) |> 
  summarise(
    mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE),
    .groups = 'drop'
  )

print(mean_difference_by_zone)
## # A tibble: 4 × 2
##   zone   mean_difference
##   <chr>            <dbl>
## 1 Zona 2          -3.00 
## 2 Zone 1          -5.52 
## 3 Zone 3          -3.33 
## 4 Zone 4          -0.824
observations_per_zone <- data_post_policy %>%
  count(zone)

# Mostrar los resultados
print(observations_per_zone)
## # A tibble: 4 × 2
##   zone       n
##   <chr>  <int>
## 1 Zona 2  2574
## 2 Zone 1  4290
## 3 Zone 3  2574
## 4 Zone 4   858